The zinb model doesn’t seem to work well in the Fluidigm data. This is independent of normalization, meaning that both unnormalized data and normalized data (both with scran and full-quantile) lead to bad results.
PCA leads to better results, even though, as many people have shown, the first component is highly correlated to detection rate, even after normalization.
ZIFA seems to work well in the Fluidigm datasets, suggesting that we should be able to find a way to make zinb work as well.
One key difference between ZIFA and zinb is that in our model both \(\mu\) and \(\pi\) depend on \(W\), while in the ZIFA model only \(\mu\) does. We should consider a model that doesn’t have a \(W\) in \(\pi\).
It seems (preliminary analyses) that removing the intercept in V makes the weird W behavior go away. Still, I think that we should add the ability to remove W from \(\pi\) and see what is the difference.
Select high coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.
data("fluidigm")
fluidigm_high <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="High")]
filter <- apply(assay(fluidigm_high)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 6998
Fit a zinb model on the high coverage data (no normalization).
fluidigm_high <- fluidigm_high[filter,]
high <- assay(fluidigm_high)
set.seed(23424)
print(system.time(zinb_high <- zinbFit(high, ncores = 3, K = 2)))
## user system elapsed
## 683.473 84.752 297.057
Plot the results with cells colored according to their biological condition.
bio <- as.factor(colData(fluidigm_high)$Biological_Condition)
cl <- as.factor(colData(fluidigm_high)$Cluster2)
plot(zinb_high@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_high@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
Compared to PCA (of the log counts)
pca <- prcomp(t(log1p(high)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_high@W[,1], colSums(high>0),method="spearman")
## [1] -0.2168706
#second factor and the total number of detected genes in the cell
cor(zinb_high@W[,2], colSums(high>0),method="spearman")
## [1] 0.5244318
#first factor and the total number of counts in the cell
cor(zinb_high@W[,1], colSums(high),method="spearman")
## [1] -0.1518794
#second factor and the total number of counts in the cell
cor(zinb_high@W[,2], colSums(high),method="spearman")
## [1] 0.3468531
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(high>0), method="spearman")
## [1] 0.9232517
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(high>0), method="spearman")
## [1] 0.297028
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(high), method="spearman")
## [1] 0.1270105
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(high), method="spearman")
## [1] -0.3141171
Correlation between PCA and ZINB
cor(pca$x[,1], zinb_high@W[,1], method="spearman")
## [1] -0.2783654
cor(pca$x[,2], zinb_high@W[,1], method="spearman")
## [1] -0.01188811
cor(pca$x[,1], zinb_high@W[,2], method="spearman")
## [1] 0.7553322
cor(pca$x[,2], zinb_high@W[,2], method="spearman")
## [1] -0.4728147
sce <- newSCESet(countData=data.frame(high))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))
sf <- sizeFactors(sce)
norm <- exprs(convertTo(sce, type="monocle"))
fq <- betweenLaneNormalization(high, which="full")
plotRLE(high, outline=FALSE, col=cols[bio], main="Unnormalized counts")
plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")
plotRLE(fq, outline=FALSE, col=cols[bio], main="FQ normalization")
I’m not convinced that this is the best normalization, but since it’s the only one specifically proposed for scRNA-seq, it’s a good place to start.
set.seed(3525)
offsets <- matrix(rep(sf, NROW(high)), ncol=NROW(high), nrow=NCOL(high))
print(system.time(zinb_norm <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
## user system elapsed
## 789.406 95.665 341.431
Plot the results with cells colored according to their biological condition.
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
qc <- as.matrix(colData(fluidigm_high)[,metadata(fluidigm_high)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)
quality <- qcpca$x[,1]
detection_rate <- colSums(high>0)
data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
Compared to PCA (of the log counts)
pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)
plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(high>0),method="spearman")
## [1] -0.2240385
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(high>0),method="spearman")
## [1] 0.4842657
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(high),method="spearman")
## [1] -0.1604021
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(high),method="spearman")
## [1] 0.3326923
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(high>0), method="spearman")
## [1] 0.9566434
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(high>0), method="spearman")
## [1] -0.1056818
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(high), method="spearman")
## [1] -0.07137238
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(high), method="spearman")
## [1] 0.4917832
Correlation between PCA and ZINB
cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] -0.2262675
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.004195804
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.6608392
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.6752622
se <- newSeqExpressionSet(high)
se <- betweenLaneNormalization(se, which="full", offset=TRUE)
offsets <- t(offst(se))
set.seed(35235)
print(system.time(zinb_fq <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
## user system elapsed
## 875.791 98.163 370.769
Plot the results with cells colored according to their biological condition.
plot(zinb_fq@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_fq@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
data.frame(W1=zinb_fq@W[,1], W2=zinb_fq@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
Compared to PCA (of the log counts)
pca_fq <- prcomp(t(log1p(fq)))
plot(pca_fq$x, col=cols[bio], pch=19)
legend("topright", levels(bio), fill=cols)
plot(pca_fq$x, col=cols2[cl], pch=19)
legend("topright", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_fq@W[,1], colSums(high>0),method="spearman")
## [1] 0.08378497
#second factor and the total number of detected genes in the cell
cor(zinb_fq@W[,2], colSums(high>0),method="spearman")
## [1] 0.3754808
#first factor and the total number of counts in the cell
cor(zinb_fq@W[,1], colSums(high),method="spearman")
## [1] -0.1039336
#second factor and the total number of counts in the cell
cor(zinb_fq@W[,2], colSums(high),method="spearman")
## [1] 0.3515297
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca_fq$x[,1], colSums(high>0), method="spearman")
## [1] 0.7178322
#second factor and the total number of detected genes in the cell
cor(pca_fq$x[,2], colSums(high>0), method="spearman")
## [1] 0.6338724
#first factor and the total number of counts in the cell
cor(pca_fq$x[,1], colSums(high), method="spearman")
## [1] 0.2144231
#second factor and the total number of counts in the cell
cor(pca_fq$x[,2], colSums(high), method="spearman")
## [1] -0.2588287
Correlation between PCA and ZINB
cor(pca_fq$x[,1], zinb_fq@W[,1], method="spearman")
## [1] -0.002097902
cor(pca_fq$x[,2], zinb_fq@W[,1], method="spearman")
## [1] 0.1466783
cor(pca_fq$x[,1], zinb_fq@W[,2], method="spearman")
## [1] 0.7437937
cor(pca_fq$x[,2], zinb_fq@W[,2], method="spearman")
## [1] -0.1988199
## write matrices to file to feed to ZIFA in python
write.csv(log1p(high), file="logcounts_high.csv")
write.csv(log1p(norm), file="logcounts_high_scran.csv")
Select low coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.
fluidigm_low <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="Low")]
filter <- apply(assay(fluidigm_low)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 2506
Fit a zinb model on the low coverage data (no normalization).
fluidigm_low <- fluidigm_low[filter,]
low <- assay(fluidigm_low)
set.seed(654)
print(system.time(zinb_low <- zinbFit(low, ncores = 3, K = 2)))
## user system elapsed
## 403.672 67.831 177.612
Plot the results with cells colored according to their biological condition.
bio <- as.factor(colData(fluidigm_low)$Biological_Condition)
cl <- as.factor(colData(fluidigm_low)$Cluster2)
plot(zinb_low@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_low@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
Compared to PCA (of the log counts)
pca <- prcomp(t(log1p(low)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_low@W[,1], colSums(low>0),method="spearman")
## [1] 0.4277894
#second factor and the total number of detected genes in the cell
cor(zinb_low@W[,2], colSums(low>0),method="spearman")
## [1] -0.5360534
#first factor and the total number of counts in the cell
cor(zinb_low@W[,1], colSums(low),method="spearman")
## [1] 0.0005681818
#second factor and the total number of counts in the cell
cor(zinb_low@W[,2], colSums(low),method="spearman")
## [1] 0.2904283
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(low>0), method="spearman")
## [1] -0.6268781
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(low>0), method="spearman")
## [1] -0.03942437
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(low), method="spearman")
## [1] 0.06979895
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(low), method="spearman")
## [1] -0.474257
Correlation between PCA and ZINB
cor(pca$x[,1], zinb_low@W[,1], method="spearman")
## [1] -0.8020979
cor(pca$x[,2], zinb_low@W[,1], method="spearman")
## [1] -0.06770105
cor(pca$x[,1], zinb_low@W[,2], method="spearman")
## [1] 0.8125874
cor(pca$x[,2], zinb_low@W[,2], method="spearman")
## [1] -0.4075612
sce <- newSCESet(countData=data.frame(low))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))
sf <- sizeFactors(sce)
norm <- exprs(convertTo(sce, type="monocle"))
plotRLE(low, outline=FALSE, col=cols[bio], main="Unnormalized counts")
plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")
set.seed(3525)
offsets <- matrix(rep(sf, NROW(low)), ncol=NROW(low), nrow=NCOL(low))
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, O_mu = offsets)))
## user system elapsed
## 279.736 55.275 126.172
Plot the results with cells colored according to their biological condition.
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
qc <- as.matrix(colData(fluidigm_low)[,metadata(fluidigm_low)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)
quality <- qcpca$x[,1]
detection_rate <- colSums(low>0)
data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
Compared to PCA (of the log counts)
pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)
plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(low>0),method="spearman")
## [1] 0.02917491
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(low>0),method="spearman")
## [1] -0.5416043
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(low),method="spearman")
## [1] -0.03199301
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(low),method="spearman")
## [1] 0.1593969
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(low>0), method="spearman")
## [1] -0.8312772
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(low>0), method="spearman")
## [1] 0.5694024
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(low), method="spearman")
## [1] 0.4491259
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(low), method="spearman")
## [1] -0.5332168
Correlation between PCA and ZINB
cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] 0.2093969
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.4050699
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.859965
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.1836538
x <- model.matrix(~scale(detection_rate))
set.seed(9948)
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, X = x, which_X_mu=1:2, which_X_pi=1L)))
## user system elapsed
## 323.065 52.122 140.469
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
## write matrices to file to feed to ZIFA in python
write.csv(log1p(low), file="logcounts_low.csv")
write.csv(log1p(norm), file="logcounts_low_scran.csv")
write.csv(log1p(low[1:5, 1:3]), file="logcounts_toy.csv")
zifa_res <- read.csv("zifa_low.csv", header=FALSE)
plot(zifa_res, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
library(biomaRt)
mart <- useMart("ensembl")
mart <- useDataset("hsapiens_gene_ensembl", mart = mart)
attrs <- c("hgnc_symbol", "entrezgene")
bm <- getBM(attributes=attrs, mart = mart)
bm <- bm[match(rownames(low), bm[,1]),]
bm <- na.omit(bm)
low <- low[bm[,1],]
gene_info <- getGeneLengthAndGCContent(as.character(bm[,2]), "hsa", mode="org.db")
rownames(gene_info) <- bm[,1]
gene_info <- na.omit(gene_info)
low <- low[rownames(gene_info),]
NB: currently, there is no way to have no columns of X or V in either \(\mu\) or \(\pi\). We should let the user specify NULL?
V <- cbind(rep(0, NROW(low)), gene_info)
print(system.time(zinb_gc <- zinbFit(low, ncores = 3, K = 2, V = V, which_V_mu=1L, which_V_pi=2:3)))
## user system elapsed
## 189.987 40.693 88.167
plot(zinb_gc@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(bio), fill=cols)
plot(zinb_gc@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(cl), fill=cols2)
print(system.time(zinb_nov <- zinbFit(low, ncores = 3, K = 2, V=matrix(0, ncol=1, nrow=NROW(low)))))
## user system elapsed
## 246.570 47.265 111.521
plot(zinb_nov@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(bio), fill=cols)
plot(zinb_nov@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(cl), fill=cols2)
V <- cbind(rep(0, NROW(low)), rep(1, NROW(low)))
print(system.time(zinb_v <- zinbFit(low, ncores = 3, K = 2, V=V, which_V_mu=1L, which_V_pi=2L)))
## user system elapsed
## 145.741 21.524 67.725
plot(zinb_v@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomleft", levels(bio), fill=cols)
V <- cbind(rep(0, NROW(low)), rep(1, NROW(low)))
print(system.time(zinb_v2 <- zinbFit(low, ncores = 3, K = 2, V=V, which_V_mu=2L, which_V_pi=1L)))
## user system elapsed
## 320.539 40.633 145.828
plot(zinb_v2@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomleft", levels(bio), fill=cols)
V <- cbind(rep(0, NROW(low)), rep(1, NROW(low)))
X <- cbind(rep(0, NCOL(low)), rep(1, NCOL(low)))
print(system.time(zinb_vx <- zinbFit(low, ncores = 3, K = 2, V=V, X=X, which_V_mu=1L, which_V_pi=2L, which_X_mu=2L, which_X_pi=1L)))
## user system elapsed
## 272.078 54.187 121.909
plot(zinb_vx@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomleft", levels(bio), fill=cols)